***

Opis problemu i zbioru danych

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

Biblioteki wykorzystane do realizacji projektu:

  • survival
  • survminer
  • survMisc
  • dplyr
  • gtools
  • ggplot2
  • corrplot
  • gridExtra
  • extrafont
  • VIM
  • mfp
  • splines
  • CoxR2
  • ipred
  • Rcpp
  • RColorBrewer
  • knitr
  • kableExtra
  • MASS
library(survival)
library(survminer)
library(survMisc)
library(dplyr)
library(gtools)
library(ggplot2)
library(corrplot)
library(gridExtra)
library(extrafont)
library(VIM)
library(extrafont)
library(mfp)
library(splines)
library(CoxR2)
library(ipred)
library(Rcpp)
library(RColorBrewer)
library(knitr)
library(kableExtra)
library(MASS)
data(pbc)
dane <- pbc
dane$cens <- ifelse(dane$status=="0" | dane$status=="1",0,1)
dane <- dane[1:312,]

Prezentacja danych

head(dane,10)
##    id time status trt      age sex ascites hepato spiders edema bili chol
## 1   1  400      2   1 58.76523   f       1      1       1   1.0 14.5  261
## 2   2 4500      0   1 56.44627   f       0      1       1   0.0  1.1  302
## 3   3 1012      2   1 70.07255   m       0      0       0   0.5  1.4  176
## 4   4 1925      2   1 54.74059   f       0      1       1   0.5  1.8  244
## 5   5 1504      1   2 38.10541   f       0      1       1   0.0  3.4  279
## 6   6 2503      2   2 66.25873   f       0      1       0   0.0  0.8  248
## 7   7 1832      0   2 55.53457   f       0      1       0   0.0  1.0  322
## 8   8 2466      2   2 53.05681   f       0      0       0   0.0  0.3  280
## 9   9 2400      2   1 42.50787   f       0      0       1   0.0  3.2  562
## 10 10   51      2   2 70.55989   f       1      0       1   1.0 12.6  200
##    albumin copper alk.phos    ast trig platelet protime stage cens
## 1     2.60    156   1718.0 137.95  172      190    12.2     4    1
## 2     4.14     54   7394.8 113.52   88      221    10.6     3    0
## 3     3.48    210    516.0  96.10   55      151    12.0     4    1
## 4     2.54     64   6121.8  60.63   92      183    10.3     4    1
## 5     3.53    143    671.0 113.15   72      136    10.9     3    0
## 6     3.98     50    944.0  93.00   63       NA    11.0     3    1
## 7     4.09     52    824.0  60.45  213      204     9.7     3    0
## 8     4.00     52   4651.2  28.38  189      373    11.0     3    1
## 9     3.08     79   2276.0 144.15   88      251    11.0     2    1
## 10    2.74    140    918.0 147.25  143      302    11.5     4    1
tail(dane,10)
##      id time status trt      age sex ascites hepato spiders edema bili chol
## 303 303 1250      0   2 60.65982   f       0      1       1     0  1.0  372
## 304 304 1230      0   1 35.53457   f       0      0       0     0  0.5  219
## 305 305 1216      0   2 43.06639   f       0      1       1     0  2.9  426
## 306 306 1216      0   2 56.39151   f       0      1       0     0  0.6  239
## 307 307 1149      0   2 30.57358   f       0      0       0     0  0.8  273
## 308 308 1153      0   1 61.18275   f       0      1       0     0  0.4  246
## 309 309  994      0   2 58.29979   f       0      0       0     0  0.4  260
## 310 310  939      0   1 62.33265   f       0      0       0     0  1.7  434
## 311 311  839      0   1 37.99863   f       0      0       0     0  2.0  247
## 312 312  788      0   2 33.15264   f       0      0       1     0  6.4  576
##     albumin copper alk.phos ast trig platelet protime stage cens
## 303    3.25    108     1190 140   55      248    10.6     4    0
## 304    3.93     22      663  45   75      246    10.8     3    0
## 305    3.61     73     5184 288  144      275    10.6     3    0
## 306    3.45     31     1072  55   64      227    10.7     2    0
## 307    3.56     52     1282 130   59      344    10.5     2    0
## 308    3.58     24      797  91  113      288    10.4     2    0
## 309    2.75     41     1166  70   82      231    10.8     2    0
## 310    3.35     39     1713 171  100      234    10.2     2    0
## 311    3.16     69     1050 117   88      335    10.5     2    0
## 312    3.79    186     2115 136  149      200    10.8     2    0

Ilość braków danych

NA_count <- colSums(is.na(dane))
NA_count
##       id     time   status      trt      age      sex  ascites   hepato 
##        0        0        0        0        0        0        0        0 
##  spiders    edema     bili     chol  albumin   copper alk.phos      ast 
##        0        0        0       28        0        2        0        0 
##     trig platelet  protime    stage     cens 
##       30        4        0        0        0
barplot(NA_count)

___

Imputacja danych i przekształcanie zmiennych

daneKNN <- kNN(dane, variable = c("chol", "copper", "trig", "platelet"), k = 5)

daneKNN <- daneKNN[,1:21,drop = F]
daneKNN$age <- round(daneKNN$age,0)
levels(daneKNN$sex) <- c(1,0)
daneKNN$status <- as.factor(daneKNN$status)
daneKNN$trt <- as.factor(daneKNN$trt)
daneKNN$ascites <- as.factor(daneKNN$ascites)
daneKNN$spiders <- as.factor(daneKNN$spiders)
daneKNN$hepato <- as.factor(daneKNN$hepato)
daneKNN$stage <- as.factor(daneKNN$stage)
daneKNN$edema <- as.factor(daneKNN$edema)
daneKNN$cens <- as.factor(daneKNN$cens)

str(daneKNN)
## 'data.frame':    312 obs. of  21 variables:
##  $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ time    : int  400 4500 1012 1925 1504 2503 1832 2466 2400 51 ...
##  $ status  : Factor w/ 3 levels "0","1","2": 3 1 3 3 2 3 1 3 3 3 ...
##  $ trt     : Factor w/ 2 levels "1","2": 1 1 1 1 2 2 2 2 1 2 ...
##  $ age     : num  59 56 70 55 38 66 56 53 43 71 ...
##  $ sex     : Factor w/ 2 levels "1","0": 2 2 1 2 2 2 2 2 2 2 ...
##  $ ascites : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ hepato  : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 1 1 1 ...
##  $ spiders : Factor w/ 2 levels "0","1": 2 2 1 2 2 1 1 1 2 2 ...
##  $ edema   : Factor w/ 3 levels "0","0.5","1": 3 1 2 2 1 1 1 1 1 3 ...
##  $ bili    : num  14.5 1.1 1.4 1.8 3.4 0.8 1 0.3 3.2 12.6 ...
##  $ chol    : int  261 302 176 244 279 248 322 280 562 200 ...
##  $ albumin : num  2.6 4.14 3.48 2.54 3.53 3.98 4.09 4 3.08 2.74 ...
##  $ copper  : int  156 54 210 64 143 50 52 52 79 140 ...
##  $ alk.phos: num  1718 7395 516 6122 671 ...
##  $ ast     : num  137.9 113.5 96.1 60.6 113.2 ...
##  $ trig    : int  172 88 55 92 72 63 213 189 88 143 ...
##  $ platelet: int  190 221 151 183 136 233 204 373 251 302 ...
##  $ protime : num  12.2 10.6 12 10.3 10.9 11 9.7 11 11 11.5 ...
##  $ stage   : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 3 3 3 3 2 4 ...
##  $ cens    : Factor w/ 2 levels "0","1": 2 1 2 2 1 2 1 2 2 2 ...
summary(daneKNN)
##        id              time      status  trt          age        sex    
##  Min.   :  1.00   Min.   :  41   0:168   1:158   Min.   :26.00   1: 36  
##  1st Qu.: 78.75   1st Qu.:1191   1: 19   2:154   1st Qu.:42.00   0:276  
##  Median :156.50   Median :1840   2:125           Median :50.00          
##  Mean   :156.50   Mean   :2006                   Mean   :50.03          
##  3rd Qu.:234.25   3rd Qu.:2697                   3rd Qu.:57.00          
##  Max.   :312.00   Max.   :4556                   Max.   :78.00          
##  ascites hepato  spiders edema          bili             chol       
##  0:288   0:152   0:222   0  :263   Min.   : 0.300   Min.   : 120.0  
##  1: 24   1:160   1: 90   0.5: 29   1st Qu.: 0.800   1st Qu.: 252.0  
##                          1  : 20   Median : 1.350   Median : 309.0  
##                                    Mean   : 3.256   Mean   : 364.8  
##                                    3rd Qu.: 3.425   3rd Qu.: 394.2  
##                                    Max.   :28.000   Max.   :1775.0  
##     albumin         copper          alk.phos            ast        
##  Min.   :1.96   Min.   :  4.00   Min.   :  289.0   Min.   : 26.35  
##  1st Qu.:3.31   1st Qu.: 41.75   1st Qu.:  871.5   1st Qu.: 80.60  
##  Median :3.55   Median : 73.00   Median : 1259.0   Median :114.70  
##  Mean   :3.52   Mean   : 97.73   Mean   : 1982.7   Mean   :122.56  
##  3rd Qu.:3.80   3rd Qu.:123.25   3rd Qu.: 1980.0   3rd Qu.:151.90  
##  Max.   :4.64   Max.   :588.00   Max.   :13862.4   Max.   :457.25  
##       trig           platelet        protime      stage   cens   
##  Min.   : 33.00   Min.   : 62.0   Min.   : 9.00   1: 16   0:187  
##  1st Qu.: 84.75   1st Qu.:200.0   1st Qu.:10.00   2: 67   1:125  
##  Median :107.50   Median :256.0   Median :10.60   3:120          
##  Mean   :122.34   Mean   :261.9   Mean   :10.73   4:109          
##  3rd Qu.:146.00   3rd Qu.:322.5   3rd Qu.:11.10                  
##  Max.   :598.00   Max.   :563.0   Max.   :17.10

Graficzna prezentacja danych oraz wybrane statystyki opisowe

Rozkłady zmiennych ilościowych

a <- ggplot(data = daneKNN, aes(x = age)) + geom_histogram(alpha = 0.7, binwidth = 5, fill="darkorange") + theme_minimal() +
  ggtitle('Age') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
        labs(y = "N")
b <- ggplot(data = daneKNN, aes(x = bili)) + geom_histogram(alpha = 0.7, binwidth = 2, fill="mediumpurple3") + theme_minimal() +
  ggtitle('Bilirunbin') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "N")
c <- ggplot(data = daneKNN, aes(x = chol)) + geom_histogram(alpha = 0.7, binwidth = 50, fill="seagreen4") + theme_minimal() +
  ggtitle('Cholesterol') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "N")
d <- ggplot(data = daneKNN, aes(x = albumin)) + geom_histogram(alpha = 0.7, binwidth = 0.2, fill="dodgerblue3") + theme_minimal() +
  ggtitle('Albumin') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "N")
a1 <- ggplot(data = daneKNN, aes(x = copper)) + geom_histogram(alpha = 0.7, binwidth = 30, fill="indianred4") + theme_minimal() +
  ggtitle('Copper') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "N")
b1 <- ggplot(data = daneKNN, aes(x = alk.phos)) + geom_histogram(alpha = 0.7, binwidth = 400, fill="royalblue3") + theme_minimal() +
  ggtitle('Alkaline phos.') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=20, family="serif")) +
  labs(y = "N")
c1 <- ggplot(data = daneKNN, aes(x = ast)) + geom_histogram(alpha = 0.7, binwidth = 20, fill="goldenrod2") + theme_minimal() +
  ggtitle('Aspartate amino.') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=20, family="serif")) +
  labs(y = "N")
d1 <- ggplot(data = daneKNN, aes(x = trig)) + geom_histogram(alpha = 0.7, binwidth = 20, fill="cadetblue3") + theme_minimal() +
  ggtitle('Triglycerides') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=20, family="serif")) +
  labs(y = "N")
a2 <- ggplot(data = daneKNN, aes(x = platelet)) + geom_histogram(alpha = 0.7, binwidth = 25, fill="orangered3") + theme_minimal() +
  ggtitle('Platelet') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=20, family="serif")) +
  labs(y = "N")
b2 <- ggplot(data = daneKNN, aes(x = protime)) + geom_histogram(alpha = 0.7, binwidth = 0.5, fill="darkolivegreen4") + theme_minimal() +
  ggtitle('Protime') + 
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=20, family="serif")) +
  labs(y = "N")
c2 <- ggplot(data = daneKNN, aes(x = time, fill = cens)) + geom_histogram(alpha = 0.7, binwidth = 200) + facet_grid(~cens) + theme_minimal() +
  ggtitle('Time for cens & event') + 
  scale_fill_manual(values = c("coral3","slateblue3")) +
  theme(axis.title.y = element_text(color="Grey23", size=11),
        axis.text.y = element_text(size=9),
        axis.title.x = element_blank(),
        plot.title = element_text(color="gray26", size=20, family="serif")) +
  labs(y = "N")

grid.arrange(a,b,c,d)

grid.arrange(a1,b1,c1,d1)

grid.arrange(a2,b2)

c2

Rozkłady zmiennych jakościowych

e <- daneKNN %>%
  count(sex) %>%
  ggplot() + geom_col(aes(x = sex, y = n, fill = sex), alpha = 0.7)+ geom_label(aes(x = sex, y = n, label = n)) + scale_fill_manual(values = c("navajowhite3","plum4")) +
  theme_minimal() +
  ggtitle('Number of patients by gender') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

f <- daneKNN %>%
  count(trt) %>%
  ggplot() + geom_col(aes(x = trt, y = n, fill = trt), alpha = 0.7)+ geom_label(aes(x = trt, y = n, label = n)) + scale_fill_manual(values = c("lightseagreen","cornsilk3")) +
  theme_minimal() +
  ggtitle('Number of patients by treatment') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

g <- daneKNN %>%
  count(ascites) %>%
  ggplot() + geom_col(aes(x = ascites, y = n, fill = ascites), alpha = 0.7)+ geom_label(aes(x = ascites, y = n, label = n)) + scale_fill_manual(values = c("darkseagreen4","coral3")) +
  theme_minimal() +
  ggtitle('Number of patients by ascites presence') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

h <- daneKNN %>%
  count(hepato) %>%
  ggplot() + geom_col(aes(x = hepato, y = n, fill = hepato), alpha = 0.7)+ geom_label(aes(x = hepato, y = n, label = n)) + scale_fill_manual(values = c("palegreen4", "indianred3")) +
  theme_minimal() +
  ggtitle('Number of patients by hepato presence') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

e1 <- daneKNN %>%
  count(edema) %>%
  ggplot() + geom_col(aes(x = edema, y = n, fill = edema), alpha = 0.7)+ geom_label(aes(x = edema, y = n, label = n)) + scale_fill_manual(values = c("springgreen3","tan3","tomato3")) +
  theme_minimal() +
  ggtitle('Number of patients by edema presence') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

f1 <- daneKNN %>%
  count(stage) %>%
  ggplot() + geom_col(aes(x = stage, y = n, fill = stage), alpha = 0.7)+ geom_label(aes(x = stage, y = n, label = n)) + scale_fill_manual(values = c("palegreen3", "skyblue3","salmon3","brown3")) +
  theme_minimal() +
  ggtitle('Number of patients by stage of disease') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

g1 <- daneKNN %>%
  count(spiders) %>%
  ggplot() + geom_col(aes(x = spiders, y = n, fill = spiders), alpha = 0.7)+ geom_label(aes(x = spiders, y = n, label = n)) + scale_fill_manual(values = c("forestgreen", "chocolate3")) +
  theme_minimal() +
  ggtitle('Number of patients spiders presence') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

h1 <- daneKNN %>%
  count(cens) %>%
  ggplot() + geom_col(aes(x = cens, y = n, fill = cens), alpha = 0.7)+ geom_label(aes(x = cens, y = n, label = n)) + scale_fill_manual(values = c("coral3","slateblue3")) +
  theme_minimal() +
  ggtitle('Number of patients by cens or event') + 
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=19, family="serif"))

grid.arrange(e,f)

grid.arrange(g,h)

grid.arrange(e1,f1)

grid.arrange(g1,h1)

Rozkłady pokazujące wybrane zależności

z <- ggplot(daneKNN, aes(x=stage, y = age, fill=stage)) + geom_boxplot(size=1.2, alpha=0.5) +
  scale_fill_manual(values = c("palegreen3", "skyblue3","salmon3","brown3")) + theme_minimal() +
  ggtitle("Age by stage") +
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "Age")

x <- ggplot(daneKNN, aes(x=stage, y = bili, fill=stage)) + geom_boxplot(size=1.2, alpha=0.5) +
  scale_fill_manual(values = c("palegreen3", "skyblue3","salmon3","brown3")) + theme_minimal() +
  ggtitle("Bilirunbin by stage") +
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "Bilirunbin")
z1 <- ggplot(daneKNN, aes(x=stage, y = time, fill=stage)) + geom_boxplot(size=1.2, alpha=0.5) +
  scale_fill_manual(values = c("palegreen3", "skyblue3","salmon3","brown3")) + theme_minimal() +
  ggtitle("Time by stage") +
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "Time") + facet_grid(~cens)
x1 <- ggplot(daneKNN, aes(x=ascites, y = bili, fill=ascites)) + geom_boxplot(size=1.2, alpha=0.5) +
  scale_fill_manual(values = c("darkseagreen4","coral3")) + theme_minimal() +
  ggtitle("Bilirunbin by ascites presence") +
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "Bilirunbin")

x2 <- ggplot(daneKNN, aes(x=hepato, y = bili, fill=hepato)) + geom_boxplot(size=1.2, alpha=0.5) +
  scale_fill_manual(values = c("palegreen4", "indianred3")) + theme_minimal() +
  ggtitle("Bilirunbin by hepato presence") +
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "Bilirunbin")

z2 <- ggplot(daneKNN, aes(x=hepato, y = copper, fill=hepato)) + geom_boxplot(size=1.2, alpha=0.5) +
  scale_fill_manual(values = c("palegreen4", "indianred3")) + theme_minimal() +
  ggtitle("Copper by hepato presence") +
  theme(axis.title.y = element_text(color="Grey23", size=12),
        axis.title.x=element_blank(),
        legend.title = element_text(size=12),
        legend.text = element_text(size=8),
        legend.position = "right",
        legend.justification = c(0.94,0.94),
        legend.background = element_rect(fill="grey88",
                                         size=0.5, linetype="solid", 
                                         colour ="gray26"),
        plot.title = element_text(color="gray26", size=18, family="serif")) +
  labs(y = "Copper")

grid.arrange(z,x)

grid.arrange(z1,x1)

grid.arrange(z2,x2)

Statystyki względem grup

Statystyki dla wieku względem płci
tab1 <- daneKNN %>%
  group_by(sex) %>%
  summarise(N = n(), Mean = round(mean(age),2), Min = min(age), Max = max(age), Sd = round(sd(age),2))
  
tab1 %>%
  kbl() %>%
  kable_styling(full_width = T, html_font = "Cambria", fixed_thead = T, font_size = 16)
sex N Mean Min Max Sd
1 36 56.19 33 78 11.54
0 276 49.22 26 77 10.20
Statystyki dla bilirunbiny względem obecności edemy
tab2 <- daneKNN %>%
  group_by(edema) %>%
  summarise(N = n(), Mean = round(mean(bili),2), Min = min(bili), Max = max(bili), Sd = round(sd(bili),2))

tab2 %>%
  kbl() %>%
  kable_styling(full_width = T, html_font = "Cambria", fixed_thead = T, font_size = 16)
edema N Mean Min Max Sd
0 263 2.52 0.3 25.5 3.35
0.5 29 5.76 0.6 28.0 7.24
1 20 9.26 0.8 22.5 7.00
Statystyki dla bilirunbiny względem stopnia rozwoju choroby
tab3 <- daneKNN %>%
  group_by(stage) %>%
  summarise(N = n(), Mean = round(mean(bili),2), Min = min(bili), Max = max(bili), Sd = round(sd(bili),2))

tab3 %>%
  kbl() %>%
  kable_styling(full_width = T, html_font = "Cambria", fixed_thead = T, font_size = 16)
stage N Mean Min Max Sd
1 16 1.07 0.5 6.0 1.33
2 67 2.13 0.3 25.5 3.82
3 120 2.90 0.3 28.0 4.34
4 109 4.66 0.6 24.5 5.05
Statystyki dla poziomu cholesterolu względem płci
tab4 <- daneKNN %>%
  group_by(sex) %>%
  summarise(N = n(), Mean = round(mean(chol),2), Min = min(chol), Max = max(chol), Sd = round(sd(chol),2))

tab4 %>%
  kbl() %>%
  kable_styling(full_width = T, html_font = "Cambria", fixed_thead = T, font_size = 16)
sex N Mean Min Max Sd
1 36 357.61 151 1000 178.80
0 276 365.79 120 1775 228.08

Tu będzie tekst

Statystyki ilości trójglicerydów względem płci
tab5 <- daneKNN %>%
  group_by(sex) %>%
  summarise(N = n(), Mean = round(mean(trig),2), Min = min(trig), Max = max(trig), Sd = round(sd(trig),2))

tab5 %>%
  kbl() %>%
  kable_styling(full_width = T, html_font = "Cambria", fixed_thead = T, font_size = 16)
sex N Mean Min Max Sd
1 36 132.56 49 242 51.68
0 276 121.00 33 598 64.12
Statystyki ilości trójglicerydów względem stopnia rozwoju choroby
tab6 <- daneKNN %>%
  group_by(stage) %>%
  summarise(N = n(), Mean = round(mean(trig),2), Min = min(trig), Max = max(trig), Sd = round(sd(trig),2))

tab6 %>%
  kbl() %>%
  kable_styling(full_width = T, html_font = "Cambria", fixed_thead = T, font_size = 16)
stage N Mean Min Max Sd
1 16 90.75 55 188 32.22
2 67 112.63 44 319 53.19
3 120 128.68 46 382 56.91
4 109 125.96 33 598 75.28

Wykres pokazujący korelacje pomiędzy zmiennymi ilościowymi

doKor <- daneKNN[,c(5, 12,13,14,15,16,17,18,19)]
corMat <- cor(doKor)
#corrplot(corMat,method="number", tl.col = 'black')
corrplot(corMat, method = 'color',addCoef.col = 'black', order = 'AOE', tl.col = 'black', col=brewer.pal(n=10, name="PuOr"))


Estymator Kaplana - Meiera

daneKNN$cens <- as.character(daneKNN$cens)
daneKNN$cens <- as.numeric(daneKNN$cens)
Surv(daneKNN$time, daneKNN$cens)
##   [1]  400  4500+ 1012  1925  1504+ 2503  1832+ 2466  2400    51  3762   304 
##  [13] 3577+ 1217  3584  3672+  769   131  4232+ 1356  3445+  673   264  4079 
##  [25] 4127+ 1444    77   549  4509+  321  3839  4523+ 3170  3933+ 2847  3611+
##  [37]  223  3244  2297  4467+ 1350  4453+ 4556+ 3428  4025+ 2256  2576+ 4427+
##  [49]  708  2598  3853  2386  1000  1434  1360  1847  3282  4459+ 2224  4365+
##  [61] 4256+ 3090   859  1487  3992+ 4191  2769  4039+ 1170  3458+ 4196+ 4184+
##  [73] 4190+ 1827  1191    71   326  1690  3707+  890  2540  3574  4050+ 4032+
##  [85] 3358  1657   198  2452+ 1741  2689   460   388  3913+  750   130  3850+
##  [97]  611  3823+ 3820+  552  3581+ 3099+  110  3086  3092+ 3222  3388+ 2583 
## [109] 2504+ 2105  2350+ 3445   980  3395  3422+ 3336+ 1083  2288   515  2033+
## [121]  191  3297+  971  3069+ 2468+  824  3255+ 1037  3239+ 1413   850  2944+
## [133] 2796  3149+ 3150+ 3098+ 2990+ 1297  2106+ 3059+ 3050+ 2419   786   943 
## [145] 2976+ 2615+ 2995+ 1427   762  2891+ 2870+ 1152  2863+  140  2666+  853 
## [157] 2835+ 2475+ 1536  2772+ 2797+  186  2055   264  1077  2721+ 1682  2713+
## [169] 1212  2692+ 2574+ 2301+ 2657+ 2644+ 2624+ 1492  2609+ 2580+ 2573+ 2563+
## [181] 2556+ 2555+ 2241+  974  2527+ 1576   733  2332+ 2456+ 2504+  216  2443+
## [193]  797  2449+ 2330+ 2363+ 2365+ 2357+ 1592+ 2318+ 2294+ 2272+ 2221+ 2090 
## [205] 2081  2255+ 2171+  904  2216+ 2224+ 2195+ 2176+ 2178+ 1786  1080  2168+
## [217]  790  2170+ 2157+ 1235  2050+  597   334  1945+ 2022+ 1978+  999  1967+
## [229]  348  1979+ 1165  1951+ 1932+ 1776+ 1882+ 1908+ 1882+ 1874+  694  1831+
## [241]  837+ 1810+  930  1690  1790+ 1435+  732+ 1785+ 1783+ 1769+ 1457+ 1770+
## [253] 1765+  737+ 1735+ 1701+ 1614+ 1702+ 1615+ 1656+ 1677+ 1666+ 1301+ 1542+
## [265] 1084+ 1614+  179  1191  1363+ 1568+ 1569+ 1525+ 1558+ 1447+ 1349+ 1481+
## [277] 1434+ 1420+ 1433+ 1412+   41  1455+ 1030+ 1418+ 1401+ 1408+ 1234+ 1067+
## [289]  799  1363+  901+ 1329+ 1320+ 1302+  877+ 1321+  533+ 1300+ 1293+  207 
## [301] 1295+ 1271+ 1250+ 1230+ 1216+ 1216+ 1149+ 1153+  994+  939+  839+  788+
daneKNN$cens <- as.character(daneKNN$cens)
daneKNN$cens <- as.numeric(daneKNN$cens)
pbcKM <- survfit(Surv(time, cens) ~ 1, conf.type="plain", data=daneKNN) #KM
pbcKM1 <- survfit(Surv(time, cens) ~ 1, conf.type="log", data=daneKNN)
pbcNELS=survfit(Surv(time,cens) ~ 1, conf.type ="plain", type="fleming-harrington",data=daneKNN)

splots <- list()
splots[[1]] <- ggsurvplot(
  fit = pbcKM, 
  xlab = "Days", 
  ylab = "Overall survival probability with Kaplan - Meier estimator and plain confidence interval")
splots[[2]] <- ggsurvplot(
  fit = pbcKM1, 
  xlab = "Days", 
  ylab = "Overall survival probability with with Kaplan - Meier estimator and log confidence interval",
  palette = "blue")
splots[[3]] <- ggsurvplot(
  fit = pbcNELS, 
  xlab = "Days", 
  ylab = "Overall survival probability with Nelson - Aelen estimator",
  palette = "limegreen")

arrange_ggsurvplots(splots, print = TRUE,
  ncol = 3, nrow = 1)

summary(pbcKM)$table
##   records     n.max   n.start    events     rmean se(rmean)    median   0.95LCL 
##  312.0000  312.0000  312.0000  125.0000 2973.6112  101.8206 3395.0000 3086.0000 
##   0.95UCL 
## 3839.0000
summary(pbcKM1)$table
##   records     n.max   n.start    events     rmean se(rmean)    median   0.95LCL 
##  312.0000  312.0000  312.0000  125.0000 2973.6112  101.8206 3395.0000 3086.0000 
##   0.95UCL 
## 3853.0000
summary(pbcNELS)$table
##   records     n.max   n.start    events     rmean se(rmean)    median   0.95LCL 
##  312.0000  312.0000  312.0000  125.0000 2978.9327  102.2104 3395.0000 3086.0000 
##   0.95UCL 
## 3839.0000
daneKNN$Age_cat=quantcut(daneKNN$age,q=4)
daneKNN$bili_cat=quantcut(daneKNN$bili,q=4)
daneKNN$chol_cat=quantcut(daneKNN$chol,q=4)
daneKNN$albumin_cat=quantcut(daneKNN$albumin,q=4)
daneKNN$copper_cat=quantcut(daneKNN$copper,q=4)
daneKNN$alk_cat=quantcut(daneKNN$alk.phos,q=4)
daneKNN$ast_cat=quantcut(daneKNN$ast,q=4)
daneKNN$trig_cat=quantcut(daneKNN$trig,q=4)
daneKNN$plat_cat=quantcut(daneKNN$platelet,q=4)
daneKNN$pro_cat=quantcut(daneKNN$protime,q=4)

Rozkłady czasu trwania dla grup

groupSplots1 <- list()
groupSplots1[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ sex, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("navajowhite3","plum4"))
groupSplots1[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ trt, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("lightseagreen","cornsilk3"))

arrange_ggsurvplots(groupSplots1, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

kmsex <- survfit(Surv(time, cens) ~ sex, data = daneKNN)
summary(kmsex)$table
##       records n.max n.start events    rmean se(rmean) median 0.95LCL 0.95UCL
## sex=1      36    36      36     22 2479.267  281.2595   2386    1297      NA
## sex=0     276   276     276    103 3048.694  109.5150   3428    3170      NA
kmtrt <- survfit(Surv(time, cens) ~ trt, data = daneKNN)
summary(kmtrt)$table
##       records n.max n.start events    rmean se(rmean) median 0.95LCL 0.95UCL
## trt=1     158   158     158     65 2949.313  141.7174   3282    2583      NA
## trt=2     154   154     154     60 3002.749  145.7727   3428    3090      NA
groupSplots2 <- list()
groupSplots2[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ ascites, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("darkseagreen4","coral3"))
groupSplots2[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ spiders, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("forestgreen", "chocolate3"))

arrange_ggsurvplots(groupSplots2, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

kmascites <- survfit(Surv(time, cens) ~ ascites, data = daneKNN)
summary(kmascites)$table
##           records n.max n.start events     rmean se(rmean) median 0.95LCL
## ascites=0     288   288     288    102 3165.2653  102.3598   3584    3282
## ascites=1      24    24      24     23  856.1944  201.0002    368     223
##           0.95UCL
## ascites=0      NA
## ascites=1    1191
kmspiders <- survfit(Surv(time, cens) ~ spiders, data = daneKNN)
summary(kmspiders)$table
##           records n.max n.start events    rmean se(rmean) median 0.95LCL
## spiders=0     222   222     222     73 3287.546  113.0711   3839    3282
## spiders=1      90    90      90     52 2153.799  188.2438   1847    1434
##           0.95UCL
## spiders=0      NA
## spiders=1    3244
groupSplots3 <- list()
groupSplots3[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ hepato, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("palegreen4", "indianred3"))
groupSplots3[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ edema, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("springgreen3","tan3","tomato3"))

arrange_ggsurvplots(groupSplots3, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

kmhepato <- survfit(Surv(time, cens) ~ hepato, data = daneKNN)
summary(kmhepato)$table
##          records n.max n.start events    rmean se(rmean) median 0.95LCL 0.95UCL
## hepato=0     152   152     152     37 3606.189  129.0008     NA    3584      NA
## hepato=1     160   160     160     88 2372.598  138.0432   2256    1741    3244
kmedema <- survfit(Surv(time, cens) ~ edema, data = daneKNN)
summary(kmedema)$table
##           records n.max n.start events    rmean se(rmean) median 0.95LCL
## edema=0       263   263     263     89 3231.814  104.2501   3584    3244
## edema=0.5      29    29      29     17 2269.608  350.2181   1576    1012
## edema=1        20    20      20     19  673.550  207.7994    299     179
##           0.95UCL
## edema=0        NA
## edema=0.5      NA
## edema=1       971
ggsurvplot(survfit(Surv(time, cens) ~ stage, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_bw(),
           palette = c("palegreen3", "skyblue3","salmon3","brown3"))

kmstage <- survfit(Surv(time, cens) ~ stage, data = daneKNN)
summary(kmstage)$table
##         records n.max n.start events    rmean se(rmean) median 0.95LCL 0.95UCL
## stage=1      16    16      16      1 4358.727  188.0922     NA      NA      NA
## stage=2      67    67      67     16 3636.000  179.4216   4079    3839      NA
## stage=3     120   120     120     43 3156.503  153.5608   3428    2847      NA
## stage=4     109   109     109     65 2101.587  173.8259   1682    1165    2796
groupSplots4 <- list()
groupSplots4[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ trig_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_minimal(),
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"))
groupSplots4[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ Age_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_gray(),
           palette = c("wheat3","seagreen3", "slateblue2", "sienna3"))

arrange_ggsurvplots(groupSplots4, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

groupSplots5 <- list()
groupSplots5[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ bili_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_minimal(),
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"))
groupSplots5[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ chol_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_gray(),
           palette = c("wheat3","seagreen3", "slateblue2", "sienna3"))

arrange_ggsurvplots(groupSplots5, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

groupSplots6 <- list()
groupSplots6[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ albumin_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_minimal(),
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"))
groupSplots6[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ copper_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_gray(),
           palette = c("wheat3","seagreen3", "slateblue2", "sienna3"))

arrange_ggsurvplots(groupSplots6, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

groupSplots7 <- list()
groupSplots7[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ alk_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_minimal(),
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"))
groupSplots7[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ ast_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_gray(),
           palette = c("wheat3","seagreen3", "slateblue2", "sienna3"))

arrange_ggsurvplots(groupSplots7, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

groupSplots8 <- list()
groupSplots8[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ plat_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata", 
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_minimal(),
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"))
groupSplots8[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ pro_cat, data = daneKNN),
           pval = TRUE, conf.int = TRUE,
           risk.table = TRUE,
           risk.table.col = "strata",
           linetype = "strata",
           surv.median.line = "hv", 
           ggtheme = theme_gray(),
           palette = c("wheat3","seagreen3", "slateblue2", "sienna3"))


arrange_ggsurvplots(groupSplots8, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

___

Graficzna prezentacjia funkcji skumulowanego hazardu dla wybranych zmiennych

Zmienne sex oraz stage

groupHplots <- list()
groupHplots[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ sex, data = daneKNN),
           conf.int = TRUE,
           risk.table.col = "strata",
           ggtheme = theme_gray(), 
           palette = c("navajowhite3","plum4"),
           fun = "cumhaz")
groupHplots[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ stage, data = daneKNN),
           conf.int = TRUE,
           risk.table.col = "strata",
           ggtheme = theme_minimal(), 
           palette = c("palegreen3", "skyblue3","salmon3","brown3"),
           fun = "cumhaz")


arrange_ggsurvplots(groupHplots, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

Zmienne hepato oraz skategoryzowana zmienna trig

groupHplots <- list()
groupHplots[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ hepato, data = daneKNN),
           conf.int = TRUE,
           risk.table.col = "strata",
           ggtheme = theme_gray(), 
           palette = c("palegreen4", "indianred3"),
           fun = "cumhaz")
groupHplots[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ trig_cat, data = daneKNN),
           conf.int = TRUE,
           risk.table.col = "strata",
           ggtheme = theme_minimal(), 
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"),
           fun = "cumhaz")


arrange_ggsurvplots(groupHplots, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

Skategoryzowane zmienne bili i copper

test

groupHplots <- list()
groupHplots[[1]] <- ggsurvplot(survfit(Surv(time, cens) ~ copper_cat, data = daneKNN),
           conf.int = TRUE,
           risk.table.col = "strata",
           ggtheme = theme_gray(), 
           palette = c("wheat3","seagreen3", "slateblue2", "sienna3"),
           fun = "cumhaz")
groupHplots[[2]] <- ggsurvplot(survfit(Surv(time, cens) ~ bili_cat, data = daneKNN),
           conf.int = TRUE,
           risk.table.col = "strata",
           ggtheme = theme_minimal(), 
           palette = c("peachpuff3","lightgreen", "paleturquoise4", "orchid4"),
           fun = "cumhaz")


arrange_ggsurvplots(groupHplots, print = TRUE,
  ncol = 2, nrow = 1, risk.table.height = 0.2)

___

Model PH Coxa

  • Zbuduj model PH Coxa. Uwzględnij następujące zagadnienia:
    • interpretacja uzyskanych oszacowań parametrów modelu
    • weryfikacja założeń modelu (proporcjonalność hazardów)
    • wybór postaci funkcji zmiennych objaśniających
    • selekcja zmiennych do modelu
    • reszty w modelu
    • wybór najlepszego modelu
    • ocena dopasowania modelu

Estymacja modelu

Cox <- coxph(Surv(time,cens)~age+sex+ascites+hepato+spiders+bili+chol+albumin+copper+alk.phos+ast+trig+platelet+protime, data = daneKNN)
summary(Cox)  
## Call:
## coxph(formula = Surv(time, cens) ~ age + sex + ascites + hepato + 
##     spiders + bili + chol + albumin + copper + alk.phos + ast + 
##     trig + platelet + protime, data = daneKNN)
## 
##   n= 312, number of events= 125 
## 
##                coef  exp(coef)   se(coef)      z Pr(>|z|)    
## age       2.706e-02  1.027e+00  1.033e-02  2.619 0.008815 ** 
## sex0     -1.019e-01  9.031e-01  2.972e-01 -0.343 0.731787    
## ascites1  3.428e-01  1.409e+00  3.411e-01  1.005 0.314994    
## hepato1   4.667e-01  1.595e+00  2.257e-01  2.068 0.038625 *  
## spiders1  1.863e-01  1.205e+00  2.222e-01  0.838 0.401818    
## bili      8.917e-02  1.093e+00  2.174e-02  4.102  4.1e-05 ***
## chol      1.604e-04  1.000e+00  3.945e-04  0.407 0.684291    
## albumin  -1.047e+00  3.511e-01  2.608e-01 -4.013  6.0e-05 ***
## copper    2.949e-03  1.003e+00  1.150e-03  2.564 0.010349 *  
## alk.phos -2.074e-05  1.000e+00  3.810e-05 -0.544 0.586184    
## ast       3.255e-03  1.003e+00  1.731e-03  1.880 0.060082 .  
## trig     -1.136e-03  9.989e-01  1.253e-03 -0.906 0.364814    
## platelet -3.698e-04  9.996e-01  1.118e-03 -0.331 0.740890    
## protime   3.053e-01  1.357e+00  8.253e-02  3.700 0.000216 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## age         1.0274     0.9733    1.0068    1.0484
## sex0        0.9031     1.1072    0.5044    1.6172
## ascites1    1.4088     0.7098    0.7219    2.7494
## hepato1     1.5947     0.6271    1.0247    2.4818
## spiders1    1.2048     0.8300    0.7794    1.8623
## bili        1.0933     0.9147    1.0477    1.1409
## chol        1.0002     0.9998    0.9994    1.0009
## albumin     0.3511     2.8482    0.2106    0.5854
## copper      1.0030     0.9971    1.0007    1.0052
## alk.phos    1.0000     1.0000    0.9999    1.0001
## ast         1.0033     0.9968    0.9999    1.0067
## trig        0.9989     1.0011    0.9964    1.0013
## platelet    0.9996     1.0004    0.9974    1.0018
## protime     1.3571     0.7369    1.1544    1.5953
## 
## Concordance= 0.851  (se = 0.019 )
## Likelihood ratio test= 187.6  on 14 df,   p=<2e-16
## Wald test            = 210.4  on 14 df,   p=<2e-16
## Score (logrank) test = 317.7  on 14 df,   p=<2e-16

Zastosowanie funkcji krokowej, która poszukuje modelu najbardziej zoptymalizowanego ze wzgledu na kryterium akaike AIC (tu - metoda wsteczna)

W wyniku tej procedury otrzymano model z 7 zmiennymi objasniajacymi: age,hepato,bili,albumin,copper,ast,protime. Według kryterium AIC zmienna ast mimo p-value < 0,06 powinna znalezc sie w modelu. Pozostałe zmienne na poziomie istotnosci 0,05 sa statystycznie rozne od 0 (czyli istotne), ponieważ p-value < 0,05.

stepAIC(Cox,direction="backward")
## Start:  AIC=1120.33
## Surv(time, cens) ~ age + sex + ascites + hepato + spiders + bili + 
##     chol + albumin + copper + alk.phos + ast + trig + platelet + 
##     protime
## 
##            Df    AIC
## - platelet  1 1118.4
## - sex       1 1118.5
## - chol      1 1118.5
## - alk.phos  1 1118.6
## - spiders   1 1119.0
## - trig      1 1119.2
## - ascites   1 1119.3
## <none>        1120.3
## - ast       1 1121.7
## - hepato    1 1122.7
## - copper    1 1124.4
## - age       1 1125.2
## - protime   1 1129.6
## - bili      1 1132.2
## - albumin   1 1134.0
## 
## Step:  AIC=1118.44
## Surv(time, cens) ~ age + sex + ascites + hepato + spiders + bili + 
##     chol + albumin + copper + alk.phos + ast + trig + protime
## 
##            Df    AIC
## - chol      1 1116.5
## - sex       1 1116.6
## - alk.phos  1 1116.8
## - spiders   1 1117.2
## - trig      1 1117.5
## - ascites   1 1117.5
## <none>        1118.4
## - ast       1 1120.6
## - hepato    1 1121.2
## - copper    1 1122.4
## - age       1 1123.5
## - protime   1 1127.9
## - bili      1 1130.4
## - albumin   1 1132.3
## 
## Step:  AIC=1116.53
## Surv(time, cens) ~ age + sex + ascites + hepato + spiders + bili + 
##     albumin + copper + alk.phos + ast + trig + protime
## 
##            Df    AIC
## - sex       1 1114.7
## - alk.phos  1 1114.9
## - spiders   1 1115.3
## - trig      1 1115.5
## - ascites   1 1115.6
## <none>        1116.5
## - ast       1 1119.0
## - hepato    1 1119.3
## - copper    1 1120.4
## - age       1 1121.5
## - protime   1 1125.9
## - bili      1 1130.2
## - albumin   1 1130.3
## 
## Step:  AIC=1114.7
## Surv(time, cens) ~ age + ascites + hepato + spiders + bili + 
##     albumin + copper + alk.phos + ast + trig + protime
## 
##            Df    AIC
## - alk.phos  1 1113.1
## - spiders   1 1113.4
## - trig      1 1113.6
## - ascites   1 1113.9
## <none>        1114.7
## - ast       1 1117.5
## - hepato    1 1117.6
## - copper    1 1120.2
## - age       1 1121.0
## - protime   1 1124.0
## - bili      1 1128.2
## - albumin   1 1128.4
## 
## Step:  AIC=1113.14
## Surv(time, cens) ~ age + ascites + hepato + spiders + bili + 
##     albumin + copper + ast + trig + protime
## 
##           Df    AIC
## - trig     1 1112.1
## - spiders  1 1112.2
## - ascites  1 1112.7
## <none>       1113.1
## - hepato   1 1116.0
## - ast      1 1116.1
## - copper   1 1118.2
## - age      1 1120.6
## - protime  1 1122.1
## - albumin  1 1126.4
## - bili     1 1126.5
## 
## Step:  AIC=1112.11
## Surv(time, cens) ~ age + ascites + hepato + spiders + bili + 
##     albumin + copper + ast + protime
## 
##           Df    AIC
## - ascites  1 1111.0
## - spiders  1 1111.2
## <none>       1112.1
## - hepato   1 1114.9
## - ast      1 1115.7
## - copper   1 1117.2
## - age      1 1120.9
## - protime  1 1121.9
## - bili     1 1125.2
## - albumin  1 1125.2
## 
## Step:  AIC=1111.05
## Surv(time, cens) ~ age + hepato + spiders + bili + albumin + 
##     copper + ast + protime
## 
##           Df    AIC
## - spiders  1 1110.4
## <none>       1111.0
## - hepato   1 1113.6
## - ast      1 1114.2
## - copper   1 1118.5
## - protime  1 1121.7
## - age      1 1121.9
## - bili     1 1126.2
## - albumin  1 1128.5
## 
## Step:  AIC=1110.39
## Surv(time, cens) ~ age + hepato + bili + albumin + copper + ast + 
##     protime
## 
##           Df    AIC
## <none>       1110.4
## - ast      1 1113.4
## - hepato   1 1114.0
## - copper   1 1119.1
## - age      1 1120.7
## - protime  1 1122.4
## - bili     1 1127.5
## - albumin  1 1129.1
## Call:
## coxph(formula = Surv(time, cens) ~ age + hepato + bili + albumin + 
##     copper + ast + protime, data = daneKNN)
## 
##               coef  exp(coef)   se(coef)      z        p
## age      0.0326504  1.0331893  0.0092752  3.520 0.000431
## hepato1  0.5080490  1.6620454  0.2174273  2.337 0.019458
## bili     0.0854208  1.0891753  0.0177598  4.810 1.51e-06
## albumin -1.1054157  0.3310732  0.2348614 -4.707 2.52e-06
## copper   0.0032776  1.0032830  0.0009424  3.478 0.000505
## ast      0.0037099  1.0037168  0.0015580  2.381 0.017255
## protime  0.3220291  1.3799250  0.0778588  4.136 3.53e-05
## 
## Likelihood ratio test=183.5  on 7 df, p=< 2.2e-16
## n= 312, number of events= 125

Estymacja nowego modelu Coxa po odrzuceniu zmiennych nieistotnych statystycznie

  • INTERPRETACJA PARAMETROW:
    • jezeli wiek wzrosnie o rok, hazard (ryzyko) zgonu rośnie o 3,3% u osob bez hepatomegalii lub powiększenia wątroby, ceteris paribus (przy pozostalych wartosciach zmiennych takich samych)
    • pacjenci z obecnościa hepatomegalii lub powiększenia wątroby(hepato1) maja hazard (ryzyko) zgonu o 66,2% wyzszy niz pacjenci z hepato = 0 ceteris paribus
    • wraz ze wzrostem bili o 1, hazard (ryzyko) zgonu rosnie o 8,9% ceteris paribus
    • wraz ze wzrostem albumin o 1, hazard (ryzyko) zgonu maleje o 67%, ceteris paribus
    • wraz ze wzrostem copper o 1, hazard (ryzyko) zgonu rosnie o 0,3% ceteris paribus
    • wraz ze wzrostem ast o 1, hazard (ryzyko) zgonu rosnie o 0,4% ceteris paribus
    • wraz ze wzrostem protime o 1, hazard (ryzyko) zgonu rosnie o 38,0% ceteris paribus
Cox1 <- coxph(Surv(time,cens)~age+hepato+bili+albumin+copper+ast+protime, data = daneKNN)
summary(Cox1)  
## Call:
## coxph(formula = Surv(time, cens) ~ age + hepato + bili + albumin + 
##     copper + ast + protime, data = daneKNN)
## 
##   n= 312, number of events= 125 
## 
##               coef  exp(coef)   se(coef)      z Pr(>|z|)    
## age      0.0326504  1.0331893  0.0092752  3.520 0.000431 ***
## hepato1  0.5080490  1.6620454  0.2174273  2.337 0.019458 *  
## bili     0.0854208  1.0891753  0.0177598  4.810 1.51e-06 ***
## albumin -1.1054157  0.3310732  0.2348614 -4.707 2.52e-06 ***
## copper   0.0032776  1.0032830  0.0009424  3.478 0.000505 ***
## ast      0.0037099  1.0037168  0.0015580  2.381 0.017255 *  
## protime  0.3220291  1.3799250  0.0778588  4.136 3.53e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age        1.0332     0.9679    1.0146    1.0521
## hepato1    1.6620     0.6017    1.0853    2.5452
## bili       1.0892     0.9181    1.0519    1.1278
## albumin    0.3311     3.0205    0.2089    0.5246
## copper     1.0033     0.9967    1.0014    1.0051
## ast        1.0037     0.9963    1.0007    1.0068
## protime    1.3799     0.7247    1.1846    1.6074
## 
## Concordance= 0.845  (se = 0.019 )
## Likelihood ratio test= 183.5  on 7 df,   p=<2e-16
## Wald test            = 199.3  on 7 df,   p=<2e-16
## Score (logrank) test = 276.9  on 7 df,   p=<2e-16

Weryfikacja założenia proporcjonalności

Jeżeli p > 0,05 to zostaly spełnione zalożenia proporcjalosci. Dla zmiennych: bili oraz protime te założenia nie zostały spełnione, więc należy usunąć te zmienne z modelu.

Prop <- cox.zph(Cox1, transform = 'km')
print(Prop)
##           chisq df      p
## age      0.3590  1 0.5491
## hepato   0.8706  1 0.3508
## bili     7.3198  1 0.0068
## albumin  1.1252  1 0.2888
## copper   0.0145  1 0.9043
## ast      1.9955  1 0.1578
## protime  5.2823  1 0.0215
## GLOBAL  16.9008  7 0.0180
plot(Prop)  

### Estymacja modelu z wykorzystaniem tylko istotnych zmiennych

Cox2 <- coxph(Surv(time,cens)~age+hepato+albumin+copper+ast, data = daneKNN)
summary(Cox2) 
## Call:
## coxph(formula = Surv(time, cens) ~ age + hepato + albumin + copper + 
##     ast, data = daneKNN)
## 
##   n= 312, number of events= 125 
## 
##               coef  exp(coef)   se(coef)      z Pr(>|z|)    
## age      0.0330674  1.0336202  0.0088058  3.755 0.000173 ***
## hepato1  0.7046633  2.0231653  0.2112452  3.336 0.000851 ***
## albumin -1.2051939  0.2996339  0.2336910 -5.157 2.51e-07 ***
## copper   0.0046068  1.0046175  0.0008553  5.386 7.19e-08 ***
## ast      0.0047647  1.0047761  0.0012846  3.709 0.000208 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age        1.0336     0.9675    1.0159    1.0516
## hepato1    2.0232     0.4943    1.3373    3.0609
## albumin    0.2996     3.3374    0.1895    0.4737
## copper     1.0046     0.9954    1.0029    1.0063
## ast        1.0048     0.9952    1.0022    1.0073
## 
## Concordance= 0.824  (se = 0.02 )
## Likelihood ratio test= 140.4  on 5 df,   p=<2e-16
## Wald test            = 162.9  on 5 df,   p=<2e-16
## Score (logrank) test = 174.9  on 5 df,   p=<2e-16

Graficzna weryfikacja założenia proporcjonalności

reszty <- resid(Cox1, type="scaledsch") 
Time <- as.numeric(rownames(reszty))
zmienne <- names(daneKNN[,c(5,8,11,13,14,16,19)])

par(mfrow = c(3,3))
for (i in 1:7) {
  plot(log(Time), reszty[,i], xlab="ln(Czas)", main=zmienne[i],
       ylab="Skalowane reszty Schoenfelda", pch=20, cex=0.7)
  lines(smooth.spline(log(Time), reszty[,i] ), lwd=3 )
}

### Identyfikacja jednostek odstających

W zbiorze znajduje się jedna jednostką odstająca.

deviance <- residuals(Cox2,type="deviance")
s <- Cox2$linear.predictors
plot(s,deviance,xlab="Liniowy predyktor",ylab="Reszty odchylen",cex=0.5, pch=20)
abline(h=c(3,-3),lty=3)

daneKNN$deviance <- deviance
c1 <- which(daneKNN$deviance < c(-3))

Jednostki wpływowe

  • INTERPRETACJA PARAMETROW:
    • jezeli wiek wzrosnie o 1rok, hazard (ryzyko) zgonu rośnie o 4,9% u osob bez hepatomegalii lub powiększenia wątroby, ceteris paribus (przy pozostalych wartosciach zmiennych takich samych)
    • pacjenci z obecnościa hepatomegalii lub powiększenia wątroby(hepato1) maja hazard (ryzyko) zgonu o 100,5% wyzszy niz pacjenci z hepato = 0 ceteris paribus
    • wraz ze wzrostem albumin o 1, hazard (ryzyko) zgonu ?
    • wraz ze wzrostem copper o 1, hazard (ryzyko) zgonu rosnie o 0,6% ceteris paribus
    • wraz ze wzrostem ast o 1, hazard (ryzyko) zgonu rosnie o 0,8% ceteris paribus
dfb <- residuals(Cox2,type="dfbeta")
n <- dim(dfb)[1]
obs.nr <- c(1:n)
par(mfrow = c(2,3))
for (j in 1:5) {
  plot(obs.nr,dfb[,j],xlab="Numer jednostki",ylab="Przyrost oceny parametru",
       main=zmienne[j])
}

a1 <- which(abs(dfb[,1])>(0.004)) #usunac te jednostki
a2 <- which(abs(dfb[,2])>(0.06))
a3 <- which(abs(dfb[,3])>(0.1))
a4 <- which(abs(dfb[,4])>(0.0004))
a5 <- which(abs(dfb[,5])>(0.001))

c <- sort(unique(c(a1,a2,a3,a4,a5,c1)))
daneKNN_1 <- daneKNN[-c,] #zredukowany zbior

Cox3 <- coxph(Surv(time,cens)~age+hepato+albumin+copper+ast, data = daneKNN_1)
summary(Cox3)
## Call:
## coxph(formula = Surv(time, cens) ~ age + hepato + albumin + copper + 
##     ast, data = daneKNN_1)
## 
##   n= 307, number of events= 124 
## 
##               coef  exp(coef)   se(coef)      z Pr(>|z|)    
## age      0.0486020  1.0498024  0.0098801  4.919 8.69e-07 ***
## hepato1  0.6956171  2.0049458  0.2086548  3.334 0.000857 ***
## albumin -1.4274764  0.2399136  0.2545430 -5.608 2.05e-08 ***
## copper   0.0064491  1.0064699  0.0009382  6.874 6.25e-12 ***
## ast      0.0082022  1.0082359  0.0016792  4.885 1.04e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age        1.0498     0.9526    1.0297    1.0703
## hepato1    2.0049     0.4988    1.3320    3.0179
## albumin    0.2399     4.1682    0.1457    0.3951
## copper     1.0065     0.9936    1.0046    1.0083
## ast        1.0082     0.9918    1.0049    1.0116
## 
## Concordance= 0.836  (se = 0.019 )
## Likelihood ratio test= 180.3  on 5 df,   p=<2e-16
## Wald test            = 187.2  on 5 df,   p=<2e-16
## Score (logrank) test = 217.2  on 5 df,   p=<2e-16
stepAIC(Cox3,direction="backward")
## Start:  AIC=1093.72
## Surv(time, cens) ~ age + hepato + albumin + copper + ast
## 
##           Df    AIC
## <none>       1093.7
## - hepato   1 1103.4
## - ast      1 1113.2
## - age      1 1116.1
## - albumin  1 1122.8
## - copper   1 1130.0
## Call:
## coxph(formula = Surv(time, cens) ~ age + hepato + albumin + copper + 
##     ast, data = daneKNN_1)
## 
##               coef  exp(coef)   se(coef)      z        p
## age      0.0486020  1.0498024  0.0098801  4.919 8.69e-07
## hepato1  0.6956171  2.0049458  0.2086548  3.334 0.000857
## albumin -1.4274764  0.2399136  0.2545430 -5.608 2.05e-08
## copper   0.0064491  1.0064699  0.0009382  6.874 6.25e-12
## ast      0.0082022  1.0082359  0.0016792  4.885 1.04e-06
## 
## Likelihood ratio test=180.3  on 5 df, p=< 2.2e-16
## n= 307, number of events= 124

### Reszty martyngałowe dla zmiennej age

zmn1 <- daneKNN_1$age
lab1 <- "Age (years)"
reszty1 <- resid(coxph(Surv(time,cens)~1,data=daneKNN_1),type="martingale")
plot(zmn1, reszty1, xlab=lab1,ylab="Martingale residuals",cex=0.6)
lines(lowess(zmn1, reszty1,delta=1),lwd=2)

### Reszty martyngałowe dla zmiennej albumin

zmn2 <- daneKNN_1$albumin
lab2 <- "albumin"
reszty2 <- resid(coxph(Surv(time,cens)~1,data=daneKNN_1),type="martingale")
plot(zmn2, reszty2, xlab=lab2,ylab="Reszty martynga?owe",cex=0.6)
lines(lowess(zmn2, reszty2,delta=1),lwd=2)

Reszty martyngałowe dla zmiennej copper

zmn3 <- daneKNN_1$copper
lab3 <- "copper"
reszty3 <- resid(coxph(Surv(time,cens)~1,data=daneKNN_1),type="martingale")
plot(zmn3, reszty3, xlab=lab3,ylab="Reszty martynga?owe",cex=0.6)
lines(lowess(zmn3, reszty3,delta=1),lwd=2)

Reszty martyngałowe dla zmiennej ast

zmn4 <- daneKNN_1$ast
lab4 <- "ast"
reszty4 <- resid(coxph(Surv(time,cens)~1,data=daneKNN_1),type="martingale")
plot(zmn4, reszty4, xlab=lab4,ylab="Reszty martynga?owe",cex=0.6)
lines(lowess(zmn4, reszty4,delta=1),lwd=2)

### Transformacja zmiennej copper

Cox4 <- coxph(Surv(time, cens)~zmn3,data=daneKNN_1)
A1 <- round(AIC(Cox3),2) #akaike z modelu coxa
A1
## [1] 1093.72
cutpoint <- cutp(Cox4)$zmn3 ## optymalny punkt odciecia
c <- (zmn3>=cutpoint$zmn3[1])*1+0 #jezeli zmienna jest >= punkotwi odciecia to przypisuje 1 w przeciwnym wypadku 0

Cox5 <- coxph(Surv(time, cens)~c,data=daneKNN_1)
A2 <- round(AIC(Cox5),2)
A2
## [1] 1201.05

Zastosowanie wielomianów ułamkowych

m3 <- mfp(Surv(time,cens)~ fp(zmn3, df = 4, select = 0.05),family=cox, data=daneKNN_1) ## wielomiany ulamkowe
m3
## Call:
## mfp(formula = Surv(time, cens) ~ fp(zmn3, df = 4, select = 0.05), 
##     data = daneKNN_1, family = cox)
## 
## 
## Deviance table:
##           Resid. Dev
## Null model    1263.992
## Linear model  1191.1
## Final model   1179.526
## 
## Fractional polynomials:
##      df.initial select alpha df.final power1 power2
## zmn3          4   0.05  0.05        2    0.5      .
## 
## 
## Transformations of covariates:
##                formula
## zmn3 I((zmn3/100)^0.5)
## 
##        coef exp(coef) se(coef)     z p
## zmn3.1 0.22     1.246  0.02197 10.01 0
## 
## Likelihood ratio test=84.47  on 1 df, p=0 n= 307
A3 <- round(AIC(m3))
A3
## [1] 1182

Metoda wielomianowej funkcji sklejanej (splajn)

int <- quantile(na.omit(zmn3),probs=c(0.05,0.275,.5,.725,.95))
Cox6 <- coxph(Surv(time,cens)~ns(zmn3,knots=int),data=daneKNN_1)
pred <- predict(Cox6,type="terms",se.fit=TRUE, terms=1)
plot(na.omit(zmn3),exp(pred$fit),type="n",xlab=lab3,ylim=c(0,2.5),
     ylab="Hazard względny")
lines(smooth.spline(na.omit(zmn3),exp(pred$fit+1.96*pred$se.fit)),lty=2)
lines(smooth.spline(na.omit(zmn3),exp(pred$fit-1.96*pred$se.fit)),lty=2)
lines(smooth.spline(na.omit(zmn3),exp(pred$fit)),lty=1)
abline(h=1,lty=3)
legend('topright',2,c("splajn","95% przedzial ufnosci"), lty=1:2, box.lty=0)

A4 <- round(AIC(Cox6),2)
A4
## [1] 1187.47

PODSUMOWANIE TRANSFORMACJI ZMIENNEJ copper

Mimo wszystko najlepszym modelem jest model bez transformacji zmiennej copper, czyli:

rbind(A1,A2,A3,A4)
##       [,1]
## A1 1093.72
## A2 1201.05
## A3 1182.00
## A4 1187.47
Cox3 <- coxph(Surv(time,cens)~age+hepato+albumin+copper+ast, data = daneKNN_1)
summary(Cox3)
## Call:
## coxph(formula = Surv(time, cens) ~ age + hepato + albumin + copper + 
##     ast, data = daneKNN_1)
## 
##   n= 307, number of events= 124 
## 
##               coef  exp(coef)   se(coef)      z Pr(>|z|)    
## age      0.0486020  1.0498024  0.0098801  4.919 8.69e-07 ***
## hepato1  0.6956171  2.0049458  0.2086548  3.334 0.000857 ***
## albumin -1.4274764  0.2399136  0.2545430 -5.608 2.05e-08 ***
## copper   0.0064491  1.0064699  0.0009382  6.874 6.25e-12 ***
## ast      0.0082022  1.0082359  0.0016792  4.885 1.04e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## age        1.0498     0.9526    1.0297    1.0703
## hepato1    2.0049     0.4988    1.3320    3.0179
## albumin    0.2399     4.1682    0.1457    0.3951
## copper     1.0065     0.9936    1.0046    1.0083
## ast        1.0082     0.9918    1.0049    1.0116
## 
## Concordance= 0.836  (se = 0.019 )
## Likelihood ratio test= 180.3  on 5 df,   p=<2e-16
## Wald test            = 187.2  on 5 df,   p=<2e-16
## Score (logrank) test = 217.2  on 5 df,   p=<2e-16

MIARY DOPASOWANIA

r2_1 <- coxr2(Cox1) #model otrzymany metoda krokowa przed sprawdzeniem zalozen proporcjonalnosci
r2_2 <- coxr2(Cox2) #model po odrzuceniu zmiennych niespelniajacych zalozen proporcjonalnosci
r2_3 <- coxr2(Cox3) #model po odrzuceniu jednostek odstajacych i wplywowych (uznany za najlepszy)
round(rbind(r2_1$rsq, r2_2$rsq, r2_3$rsq),2)
##       rsq
## [1,] 0.77
## [2,] 0.67
## [3,] 0.77